{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b1aa2657-faca-4a11-823e-91da881a0692",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import (\n",
    "    decoders,\n",
    "    models,\n",
    "    normalizers,\n",
    "    pre_tokenizers,\n",
    "    processors,\n",
    "    trainers,\n",
    "    Tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0cb845c3-83f3-43a9-aab4-7b58af368326",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Byte-level BPE tokenizer, built from an empty BPE model.\n",
    "tokenizer = Tokenizer(models.BPE())\n",
    "# use_regex=False turns off the GPT-2 whitespace-splitting regex, so spaces are\n",
    "# treated like any other character instead of acting as split points.\n",
    "tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)\n",
    "# Pair the byte-level pre-tokenizer with its decoder: with no decoder set,\n",
    "# decode() joins tokens with spaces instead of rebuilding the original text.\n",
    "tokenizer.decoder = decoders.ByteLevel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "365ff8ab-f41e-43d5-9fed-6924df8328b9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Train the tokenizer on the two human2.fna.line files with a 25,000-entry\n",
    "# vocabulary; \"<|endoftext|>\" is registered as the only special token.\n",
    "training_files = [\"human2.fna.line.train\", \"human2.fna.line.valid\"]\n",
    "trainer = trainers.BpeTrainer(\n",
    "    vocab_size=25000,\n",
    "    special_tokens=[\"<|endoftext|>\"],\n",
    ")\n",
    "tokenizer.train(training_files, trainer=trainer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0dcb2e8e-8aa9-453c-a124-26e646f49f31",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['TGGCGTGAACCCGGG', 'ATCGGG']\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: tokenize a short sequence with the freshly trained tokenizer.\n",
    "sample_sequence = \"TGGCGTGAACCCGGGATCGGG\"\n",
    "encoding = tokenizer.encode(sample_sequence)\n",
    "print(encoding.tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "11a3ec6c-1daa-4656-bd7c-746fa9208ce3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the trained tokenizer to a single JSON file in the working directory.\n",
    "tokenizer.save(\"human2_formal.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b217c817-9f25-44b6-9fde-9815acf2a25a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can then use the from_file() method to reload the Tokenizer object from that file:\n",
    "new_tokenizer = Tokenizer.from_file(\"human2_formal.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "33efa23a-e393-447d-8472-cc5898d788e9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['TGGCGTGAACCCGGG', 'ATCGGG']\n"
     ]
    }
   ],
   "source": [
    "# Tokenize the same sample with the reloaded tokenizer to confirm the\n",
    "# save/load round trip produces the same tokens.\n",
    "sample_sequence = \"TGGCGTGAACCCGGGATCGGG\"\n",
    "encoding = new_tokenizer.encode(sample_sequence)\n",
    "print(encoding.tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8d08303b-a212-46b3-8f77-7cdf99aa2a71",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/liming/anaconda3/envs/pytorch/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# To use this tokenizer with 🤗 Transformers, it has to be wrapped in a\n",
    "# PreTrainedTokenizerFast instance.\n",
    "from transformers import PreTrainedTokenizerFast\n",
    "\n",
    "# The same special token is passed as both the bos and the eos token.\n",
    "end_of_text = \"<|endoftext|>\"\n",
    "wrapped_tokenizer = PreTrainedTokenizerFast(\n",
    "    tokenizer_object=new_tokenizer, bos_token=end_of_text, eos_token=end_of_text\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "beed1b05-dbbe-4b8f-bd42-acec8121e1b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternatively, use the approach below.\n",
    "# Note: this rebinds wrapped_tokenizer, replacing any value assigned to it earlier.\n",
    "from transformers import GPT2TokenizerFast\n",
    "wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=new_tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58a9774a-5c48-4c93-833d-98752c4c1f22",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
